
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% This file computes the AKM estimates   %
% The relevant output files are AKMeffs.txt %
% code is based on CCK (2015)                   %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


path(path,'D:\Workdata\702525\MATLAB_AKM\matlab_bgl\'); %path to the matlabBGL files 

diary('I:\Workdata\702525\Alex\MA\AKM\personyear_panel_over5.log')

clear all

%LOAD DATA  
s=['I:\Workdata\702525\Alex\MA\manager\personyear_panel_cvrnr_over5.raw'];
data=importdata(s);

%impose an initial sample restriction
year=data(:,2);
%age=data(:,5);
%exp=data(:,7);

%if i==1
    sel =(year>1990);
%end
% if i==2
%     sel =(year<=1996);
% end
% if i==3
%     sel =(year>1996);
% end

data=data(sel,:);

id=data(:,1);
year=data(:,2);
firmid=data(:,3);
y=data(:,4);
age=data(:,5);
educ=data(:,6);
exp=data(:,7);
%vapp=data(:,8);
%salepp=data(:,9);
clear data

lagid=[NaN;id(1:end-1)];
sameid=lagid==id;
lagfirmid=[NaN;firmid(1:end-1)];
lagfirmid(~sameid)=NaN;

%recode education
educ(educ==1)=0;
educ(educ==2)=1;
educ(educ==3)=2;
educ(educ==4)=3;


% fprintf('\n')
% disp('Recoded education tab')
% tabulate(educ)
% disp('Note: 0 and 4 years of schooling lumped together into group 0')




firmid_old=firmid;
id_old=id;

% fprintf('\n')
% if i==1
%     big=fsize==max(fsize);
%     ref=min(firmid(big));
%     s=['Reference firm is: ' int2str(ref)];
%     disp(s)
%     s=['Reference firm has minimum size: ' int2str(min(fsize(firmid==ref)))];
%     disp(s)
%     maxid=max(firmid);
%     firmid(firmid==ref)=maxid+1;
%     lagfirmid(lagfirmid==ref)=maxid+1;
% end
% if i>1
%     disp('Double check..')
%     s=['Reference firm has minimum size: ' int2str(min(fsize(firmid==ref)))];
%     disp(s)
%     firmid(firmid==ref)=maxid+1;
%     lagfirmid(lagfirmid==ref)=maxid+1;
% end
% fprintf('\n')

lagfirmid(lagfirmid==-9)=NaN;


%RENAME
disp('Relabeling ids...')
N=length(y);
sel=~isnan(lagfirmid);

%relabel the firms
[firms,m,n]=unique([firmid;lagfirmid(sel)]);

firmid=n(1:N);
lagfirmid(sel)=n(N+1:end);


%relabel the workers
[ids,m,n]=unique(id);
id=n;

%initial descriptive stats
fprintf('\n')
disp('Some descriptive stats:')
s=['# of p-y obs: ' int2str(length(y))];
disp(s);
s=['# of workers: ' int2str(max(id))];
disp(s);
s=['# of firms: ' int2str(max(firmid))];
disp(s);

s=['mean wage: ' num2str(mean(y))];
disp(s)
s=['variance of wage: ' num2str(var(y))];
disp(s)
fprintf('\n')

%FIND CONNECTED SET
disp('Finding connected set...')
A=sparse(lagfirmid(sel),firmid(sel),1); %adjacency matrix
%make it square
[m,n]=size(A);
if m>n
    A=[A,zeros(m,m-n)];
end
if m<n
    A=[A;zeros(n-m,n)];
end
A=max(A,A'); %connections are undirected

[sindex, sz]=components(A); %get connected sets
idx=find(sz==max(sz)); %find largest set
s=['# of firms: ' int2str(length(A))];
disp(s);
s=['# connected sets:' int2str(length(sz))];
disp(s);
s=['Largest connected set contains ' int2str(max(sz)) ' firms'];
disp(s);
fprintf('\n')
clear A lagfirmid
firmlst=find(sindex==idx); %firms in connected set
sel=ismember(firmid,firmlst);

y=y(sel); firmid=firmid(sel); id=id(sel);
year=year(sel); id_old=id_old(sel); firmid_old=firmid_old(sel);
educ=educ(sel); N=length(y); 
age=age(sel); exp=exp(sel); 
%vapp=vapp(sel); salepp=salepp(sel);

disp('Relabeling ids again...')
%relabel the firms
[firms,m,n]=unique(firmid);
firmid=n;

%relabel the workers
[ids,m,n]=unique(id);
id=n;

%descriptive stats for connected set
fprintf('\n')
disp('Now restricted to the largest connected set');
s=['# of p-y obs: ' int2str(length(y))];
disp(s);
s=['# of workers: ' int2str(max(id))];
disp(s);
s=['# of firms: ' int2str(max(firmid))];
disp(s);

s=['mean wage: ' num2str(mean(y))];
disp(s)
s=['variance of wage: ' num2str(var(y))];
disp(s)
fprintf('\n')

%ESTIMATE AKM
disp('Building matrices...')
NT=length(y);
N=max(id);
J=max(firmid);

D=sparse(1:NT,id',1);
F=sparse(1:NT,firmid',1);

S=speye(J-1);
S=[S;sparse(-zeros(1,J-1))];  %N+JxN+J-1 restriction matrix 


E=sparse(1:NT,educ+1,1);
disp('Age tabulation')
tabulate(age)
age=(age-40)/40; %normalize to age 40 and rescale to avoid big numbers
A=[bsxfun(@times,E,age.^2),bsxfun(@times,E,age.^3)]; %age cubic by education

%Z=A;
yrmin=min(year);
R=sparse(1:NT,year-yrmin+1,1); %year effects
R(:,1)=[];

%vapp=(vapp-4000)/8000;
%V=[bsxfun(@times,E,vapp)];
%Z=[R, A, V];
Z=[R, A];

clear R E A


X=[D,F*S,Z];

disp('Running AKM...')
tic
xx=X'*X;
xy=X'*y;
L=ichol(xx,struct('type','ict','droptol',1e-2,'diagcomp',.2));
b=pcg(xx,xy,1e-10,1000,L,L');
toc
disp('Done')
clear xx xy L

%ANALYZE RESULTS
xb=X*b;
r=y-xb;

disp('Goodness of fit:')
dof=NT-J-N+1-size(Z,2)
RMSE=sqrt(sum(r.^2)/dof)

TSS=sum((y-mean(y)).^2);
R2=1-sum(r.^2)/TSS
adjR2=1-sum(r.^2)/TSS*(NT-1)/dof

ahat=b(1:N);
ghat=b(N+1:N+J-1);
bhat=b(N+J:end);
disp('check for problems with year effects. should report zero')
sum(bhat==0)

pe=D*ahat;
fe=F*S*ghat;
xb=X(:,N+J:end)*bhat;

clear D F X b

disp('Variance-Covariance of worker and firm effs (p-y weighted)');
cov(pe,fe)
disp('Correlation coefficient');
corr(pe,fe)
disp('Means of person/firm effs')
mean([pe,fe])

disp('Full Covariance Matrix of Components')
disp('    y      pe      fe      xb      r')
C=cov([y,pe,fe,xb,r])


%SAVE OUTPUT (this will write to the current directory)
disp('Saving main effects')
out=[id_old firmid_old year pe fe xb];
s=['I:\Workdata\702525\Alex\MA\manager\cvrnryear_fe_over5.txt'];
dlmwrite(s, out, 'delimiter', '\t', 'precision', 16);

diary off;
